
# Read in data
#setwd("C:\\Documents and Settings\\hap7\\My Documents\\research\\ELPUB\\")
setwd("Documents")
setwd("ELPUB 2008")
dat = read.csv("ELPUB 2008 Piwowar data.csv", header=TRUE, row.names=1)

# set the NAs to be zero so that we can compute with the subdisciplines
dat[is.na(dat)] = 0

dim(dat)
rownames(dat)
colnames(dat)

median.IF = median(dat$Impact.Factor, na.rm=TRUE)

# calculate some useful fields
dat$percent.microarray = dat$Record.Count/dat$Articles
 
# A quick summary of the data

### Impact factor
table(dat$strength.tri)
windows()
quartz()
boxplot(Impact.Factor ~ strength.tri,
        data = dat,
        boxwex = 0.5, 
        names=c("No data sharing policy", "Weak policy", "Strong policy"), 
        ylab = "Journal impact factor", outline=T, notch=F, log="y",
	  horizontal=FALSE)

print("Strength of Policy")
print("Median impact factor")
tapply(dat$Impact.Factor, dat$strength.tri, median)

### Publisher
print("Strength of Policy")
print("Association, Commercial")
tt = table(dat$strength.tri, dat$Published.by.Association)
tt
# Distribution of Published.by.Association journals across policy strength categories
round(tt[,2]/(tt[,1]+tt[,2]), 2)
# Percent with no policy
round(tt[1,]/(tt[1,]+tt[2,]+tt[3,]), 2)

### Open access
print("Strength of Policy")
print("Closed, Open")
tt = table(dat$strength.tri, dat$open.access)
tt
# Distribution of open access journals across policy strength categories
round(tt[,2]/(tt[,1]+tt[,2]), 2)
# Percent with no policy
round(tt[1,]/(tt[1,]+tt[2,]+tt[3,]), 2)

#### Prevalence
table(dat$strength.tri)
windows()
quartz()
boxplot(conserv.GEO.percent ~ strength.tri,
        data = dat,
        boxwex = 0.5, 
        names=c("No data sharing policy", "Weak policy", "Strong policy"), 
        ylab = "Data sharing prevalence within journals", outline=T, notch=F,
	  horizontal=FALSE)

## Breakdown by impact factor
windows()
quartz()
boxplot(conserv.GEO.percent ~ strength.tri,
        data = dat,
        subset = dat$ln.impact.factor <= median(dat$ln.impact.factor),
        boxwex = 0.5, 
        names=c("No data sharing policy", "Weak policy", "Strong policy"),
        ylab = "Data sharing prevalence within journals", outline=T, notch=F,
	  horizontal=FALSE)

windows()
quartz()
boxplot(conserv.GEO.percent ~ strength.tri,
        data = dat,
        subset = dat$ln.impact.factor > median(dat$ln.impact.factor),
        boxwex = 0.5, 
        names=c("No data sharing policy", "Weak policy", "Strong policy"),
        ylab = "Data sharing prevalence within journals", outline=T, notch=F,
	  horizontal=FALSE)

## Breakdown by association
windows()
quartz()
boxplot(conserv.GEO.percent ~ strength.tri,
        data = dat,
        subset = dat$Published.by.Association <= 0,
        boxwex = 0.5, 
        names=c("No data sharing policy", "Weak policy", "Strong policy"), 
        ylab = "Data sharing prevalence within journals", outline=T, notch=F,
	  horizontal=FALSE)

windows()
quartz()
boxplot(conserv.GEO.percent ~ strength.tri,
        data = dat,
        subset = dat$Published.by.Association > 0,
        boxwex = 0.5, 
        names=c("No data sharing policy", "Weak policy", "Strong policy"), 
        ylab = "Data sharing prevalence within journals", outline=T, notch=F,
	  horizontal=FALSE)
	  
	  
print("Strength of Policy")
print("Median data sharing prevalence")
tapply(dat$conserv.GEO.percent, dat$strength.tri, median, na.rm=T)




### Multivariate analysis

## Primary analysis

result.primary = lm(strength.bi
  ~ ln.impact.factor. + Published.by.Association + open.access + BIOCHEMISTRY...MOLECULAR.BIOLOGY + BIOTECHNOLOGY...APPLIED.MICROBIOLOGY + PLANT.SCIENCES + ONCOLOGY + CELL.BIOLOGY + GENETICS...HEREDITY + MULTIDISCIPLINARY.SCIENCES, 
	dat)
summary(result.primary)


## Prevalence
result.primary = lm(conserv.GEO.percent
  ~ strength.bi + ln.impact.factor. + Published.by.Association + open.access + BIOCHEMISTRY...MOLECULAR.BIOLOGY + BIOTECHNOLOGY...APPLIED.MICROBIOLOGY + PLANT.SCIENCES + ONCOLOGY + CELL.BIOLOGY + GENETICS...HEREDITY + MULTIDISCIPLINARY.SCIENCES, 
	dat)
summary(result.primary)

